In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1807]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected PLUG
In [1808]:
if(w.value == 'AAPL'):
  df = pd.read_csv('/content/Final_AAPL.csv')
if(w.value == 'ABUS'):
  df = pd.read_csv('/content/Final_ABUS.csv')
if(w.value == 'ARDS'):
  df = pd.read_csv('/content/Final_ARDS.csv')
if(w.value == 'BABA'):
  df = pd.read_csv('/content/Final_BABA.csv')
if(w.value == 'BFRI'):
  df = pd.read_csv('/content/Final_BFRI.csv')
if(w.value == 'FB'):
  df = pd.read_csv('/content/Final_FB.csv')
if(w.value == 'GME'):
  df = pd.read_csv('/content/Final_GME.csv')
if(w.value == 'MCD'):
  df = pd.read_csv('/content/Final_MCD.csv')
if(w.value == 'PFE'):
  df = pd.read_csv('/content/Final_PFE.csv')
if(w.value == 'PLUG'):
  df = pd.read_csv('/content/Final_PLUG.csv')
if(w.value == 'QCOM'):
  df = pd.read_csv('/content/Final_QCOM.csv')
if(w.value == 'SENS'):
  df = pd.read_csv('/content/Final_SENS.csv')
if(w.value == 'TSLA'):
  df = pd.read_csv('/content/Final_TSLA.csv')
if(w.value == 'TWTR'):
  df = pd.read_csv('/content/Final_TWTR.csv')
if(w.value == 'UUUU'):
  df = pd.read_csv('/content/Final_UUUU.csv')
In [1809]:
pd.set_option('display.max_colwidth', None)
In [1810]:
df['Date'] = df['Date'].astype("datetime64[ns]")
In [1811]:
del df['Unnamed: 0']
In [1812]:
df.head(5)
Out[1812]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2017-02-02 1.01 1.02 0.91 0.92 0.92 5960000 -9.803918 0.383241 0.004935 0.067238 1.203352 0.922362 1.062857 NaN 7.308449 0.11 86.290315 NaN NaN NaN -0.26 NaN -0.220339 21.249706 NaN NaN 13.736841 17.432961 -2.311204e+07 -6.119928e+06 -34057800.0 174736.518000 138199.374629 0.0 174736.518000 174736.518000 0.0 174736.518000 0.0 138199.374629 138199.374629 0.0 138199.374629 0 18 0 18 18 0 18 0
1 2017-02-03 0.95 0.95 0.93 0.94 0.94 2321900 2.173911 -0.534752 0.005710 0.061918 1.186846 0.884583 1.035714 NaN 6.587027 0.03 86.290315 NaN NaN NaN -0.24 NaN -0.203390 25.089532 NaN NaN 10.439166 14.115306 -2.311204e+07 -6.195975e+06 -31735900.0 130854.534800 103493.048183 0.0 130854.534800 130854.534800 0.0 130854.534800 0.0 103493.048183 103493.048183 0.0 103493.048183 0 64 0 64 64 0 64 0
2 2017-02-06 0.94 0.95 0.93 0.93 0.93 2331100 -1.063829 -0.175448 0.005220 0.055930 1.151648 0.862638 1.007143 NaN 6.013952 0.02 86.290315 NaN NaN NaN -0.21 NaN -0.184211 24.447656 NaN NaN 9.698425 11.291478 -2.544314e+07 -6.405528e+06 -34067000.0 259083.900200 204909.845943 0.0 259083.900200 259083.900200 0.0 259083.900200 0.0 204909.845943 204909.845943 0.0 204909.845943 0 45 0 45 45 0 45 0
3 2017-02-07 0.93 0.93 0.90 0.90 0.90 2213000 -3.225810 -0.304172 0.004053 0.052226 1.103042 0.848387 0.975714 NaN 5.802835 0.03 88.598807 NaN NaN NaN -0.26 NaN -0.224138 22.581220 NaN NaN 8.249151 9.462247 -2.765614e+07 -6.613070e+06 -36280000.0 148855.876867 117730.336671 0.0 148855.876867 148855.876867 0.0 148855.876867 0.0 117730.336671 117730.336671 0.0 117730.336671 0 25 0 25 25 0 25 0
4 2017-02-08 0.91 0.92 0.86 0.88 0.88 3716500 -2.222220 -0.133539 0.003686 0.053336 1.071420 0.828580 0.950000 -0.113013 6.060926 0.06 90.965274 NaN NaN NaN -0.25 -9.96721 -0.221239 21.407834 NaN NaN 7.870364 8.605980 -2.889498e+07 -6.490961e+06 -39996500.0 274465.888601 217075.483681 0.0 274465.888601 274465.888601 0.0 274465.888601 0.0 217075.483681 217075.483681 0.0 217075.483681 0 52 0 52 52 0 52 0
In [1813]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1229 entries, 0 to 1228
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1229 non-null   datetime64[ns]
 1   Open                       1229 non-null   float64       
 2   High                       1229 non-null   float64       
 3   Low                        1229 non-null   float64       
 4   Close                      1229 non-null   float64       
 5   Adj Close                  1229 non-null   float64       
 6   Volume                     1229 non-null   int64         
 7   Return                     1229 non-null   float64       
 8   Beta                       1229 non-null   float64       
 9   Variance                   1229 non-null   float64       
 10  AvgTrueRange               1229 non-null   float64       
 11  Upperband                  1229 non-null   float64       
 12  Lowerband                  1229 non-null   float64       
 13  Middleband                 1229 non-null   float64       
 14  APO                        1225 non-null   float64       
 15  NATR                       1229 non-null   float64       
 16  TRANGE                     1229 non-null   float64       
 17  DMI                        1229 non-null   float64       
 18  MACD                       1217 non-null   float64       
 19  MACDSIGNAL                 1217 non-null   float64       
 20  MACDHIST                   1217 non-null   float64       
 21  MOM                        1229 non-null   float64       
 22  PPO                        1225 non-null   float64       
 23  ROCP                       1229 non-null   float64       
 24  RSI                        1229 non-null   float64       
 25  TRIX                       1162 non-null   float64       
 26  ULTOSC                     1222 non-null   float64       
 27  SLOWK                      1229 non-null   float64       
 28  SLOWD                      1229 non-null   float64       
 29  AD                         1229 non-null   float64       
 30  ADOSC                      1229 non-null   float64       
 31  OBV                        1229 non-null   float64       
 32  Upward_momentum_created    1229 non-null   float64       
 33  Downward_momentum_created  1229 non-null   float64       
 34  B5_O_Um                    1229 non-null   float64       
 35  B5_C_Um                    1229 non-null   float64       
 36  B5_E_Um                    1229 non-null   float64       
 37  B5_A_Um                    1229 non-null   float64       
 38  B5_N_Um                    1229 non-null   float64       
 39  B5_O_Dm                    1229 non-null   float64       
 40  B5_C_Dm                    1229 non-null   float64       
 41  B5_E_Dm                    1229 non-null   float64       
 42  B5_A_Dm                    1229 non-null   float64       
 43  B5_N_Dm                    1229 non-null   float64       
 44  Verified_status_True       1229 non-null   int64         
 45  Verified_status_False      1229 non-null   int64         
 46  O                          1229 non-null   int64         
 47  C                          1229 non-null   int64         
 48  E                          1229 non-null   int64         
 49  A                          1229 non-null   int64         
 50  N                          1229 non-null   int64         
 51  Real_or_Fake_tweet         1229 non-null   int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 499.4 KB
In [1814]:
df.shape
Out[1814]:
(1229, 52)
In [1815]:
sns.set(font_scale=0.8)
In [1816]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# PLOT OUT BTC-USE'S CLOSING PRICES SINCE 2014
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')    
In [1817]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
df['returns'] = 100 * df.Close.pct_change().dropna()
In [1818]:
# CALCULATE LOG RETURNS BASED ON ABOVE FORMULA
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [1819]:
df.head()
Out[1819]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2017-02-02 1.01 1.02 0.91 0.92 0.92 5960000 -9.803918 0.383241 0.004935 0.067238 1.203352 0.922362 1.062857 NaN 7.308449 0.11 86.290315 NaN NaN NaN -0.26 NaN -0.220339 21.249706 NaN NaN 13.736841 17.432961 -2.311204e+07 -6.119928e+06 -34057800.0 174736.518000 138199.374629 0.0 174736.518000 174736.518000 0.0 174736.518000 0.0 138199.374629 138199.374629 0.0 138199.374629 0 18 0 18 18 0 18 0 NaN NaN
1 2017-02-03 0.95 0.95 0.93 0.94 0.94 2321900 2.173911 -0.534752 0.005710 0.061918 1.186846 0.884583 1.035714 NaN 6.587027 0.03 86.290315 NaN NaN NaN -0.24 NaN -0.203390 25.089532 NaN NaN 10.439166 14.115306 -2.311204e+07 -6.195975e+06 -31735900.0 130854.534800 103493.048183 0.0 130854.534800 130854.534800 0.0 130854.534800 0.0 103493.048183 103493.048183 0.0 103493.048183 0 64 0 64 64 0 64 0 2.173911 0.021506
2 2017-02-06 0.94 0.95 0.93 0.93 0.93 2331100 -1.063829 -0.175448 0.005220 0.055930 1.151648 0.862638 1.007143 NaN 6.013952 0.02 86.290315 NaN NaN NaN -0.21 NaN -0.184211 24.447656 NaN NaN 9.698425 11.291478 -2.544314e+07 -6.405528e+06 -34067000.0 259083.900200 204909.845943 0.0 259083.900200 259083.900200 0.0 259083.900200 0.0 204909.845943 204909.845943 0.0 204909.845943 0 45 0 45 45 0 45 0 -1.063829 -0.010695
3 2017-02-07 0.93 0.93 0.90 0.90 0.90 2213000 -3.225810 -0.304172 0.004053 0.052226 1.103042 0.848387 0.975714 NaN 5.802835 0.03 88.598807 NaN NaN NaN -0.26 NaN -0.224138 22.581220 NaN NaN 8.249151 9.462247 -2.765614e+07 -6.613070e+06 -36280000.0 148855.876867 117730.336671 0.0 148855.876867 148855.876867 0.0 148855.876867 0.0 117730.336671 117730.336671 0.0 117730.336671 0 25 0 25 25 0 25 0 -3.225810 -0.032790
4 2017-02-08 0.91 0.92 0.86 0.88 0.88 3716500 -2.222220 -0.133539 0.003686 0.053336 1.071420 0.828580 0.950000 -0.113013 6.060926 0.06 90.965274 NaN NaN NaN -0.25 -9.96721 -0.221239 21.407834 NaN NaN 7.870364 8.605980 -2.889498e+07 -6.490961e+06 -39996500.0 274465.888601 217075.483681 0.0 274465.888601 274465.888601 0.0 274465.888601 0.0 217075.483681 217075.483681 0.0 217075.483681 0 52 0 52 52 0 52 0 -2.222220 -0.022473
In [1820]:
# DROPPING THE 1ST ROW OF DATA 
# BECAUSE I SHIFTED IT FORWARD TO CALCULATE RETURNS/LOG RETURNS
df.dropna(inplace=True)
In [1821]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [1822]:
# CREATE A FUNCTION THAT CALCULATE REALIZED VOLATILITY
# FROM SAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Get the daily realized volatility which is calculated as the square root
    of sum of squares of log returns within a specific window interval 
    """
    n = len(series_log_return)
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [1823]:
intervals = [7, 30, 60, 180, 365]
vols_df = {}

# ITERATE OVER intervals LIST
for i in intervals:
    # GET DAILY LOG RETURNS USING THAT INTERVAL
    vols = df.log_returns.rolling(window=i)\
                         .apply(realized_volatility_daily).values

    vols_df[i] = vols

# CONVERT vols_df FROM DICTIONARY TO PANDAS DATAFRAME
vols_df = pd.DataFrame(vols_df, columns=intervals, index=df.index)
In [1824]:
# CHANGING MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for i in intervals:
    if i == 7:
        alpha = 0.5
        lw = 1
    else:
        alpha = 1.0
        lw = 2
    ax.plot(vols_df[i], label=f'{i}-Day Interval Realized Volatility', 
            alpha=alpha, lw=lw)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1825]:
INTERVAL_WINDOW = 30
n_future = 7

# GET BACKWARD LOOKING REALIZED VOLATILITY
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY 
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1826]:
df.describe()
Out[1826]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1.162000e+03 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1.162000e+03 1.162000e+03 1.162000e+03 1.162000e+03 1.162000e+03 1162.0 1.162000e+03 1.162000e+03 1162.0 1.162000e+03 1162.0 1.162000e+03 1.162000e+03 1162.0 1.162000e+03 1162.000000 1162.000000 1162.0 1162.000000 1162.000000 1162.0 1162.000000 1162.0 1162.000000 1162.000000 1133.000000 1126.000000
mean 11.051411 11.433348 10.628003 11.041816 11.041816 1.403482e+07 0.336977 0.674348 1.626164 0.869643 12.146329 9.796496 10.971413 0.209092 6.434257 0.883709 35.217283 0.208783 0.214295 -0.005511 0.257341 1.689076 0.036138 52.537530 0.278960 50.378608 50.678326 50.697808 3.611977e+08 3.758567e+06 5.303308e+08 1.052013e+06 8.320383e+05 0.0 1.052013e+06 1.052013e+06 0.0 1.052013e+06 0.0 8.320383e+05 8.320383e+05 0.0 8.320383e+05 0.713425 122.587780 0.0 123.301205 123.301205 0.0 123.301205 0.0 0.337339 0.002212 0.045064 0.045057
std 14.739007 15.294377 14.103327 14.717598 14.717598 1.644401e+07 4.877626 0.519098 8.969261 1.384153 16.457646 12.961182 14.638802 2.155206 2.409189 1.541489 23.165046 1.600007 1.518631 0.442316 3.987780 7.823507 0.173229 12.394653 0.628753 9.617377 23.663775 22.027388 4.865686e+08 1.375174e+07 5.736336e+08 1.991892e+06 1.575390e+06 0.0 1.991892e+06 1.991892e+06 0.0 1.991892e+06 0.0 1.575390e+06 1.575390e+06 0.0 1.575390e+06 2.397524 200.367379 0.0 202.290350 202.290350 0.0 202.290350 0.0 4.877678 0.047879 0.018384 0.018440
min 1.020000 1.050000 0.990000 1.010000 1.010000 8.237000e+05 -18.343198 -2.910594 0.000135 0.046701 1.368452 0.884241 1.145714 -9.356923 2.818802 0.020000 0.012125 -4.832393 -4.348631 -3.035643 -20.990002 -18.032081 -0.405462 9.444984 -1.012364 13.269339 2.748066 4.236643 -3.719281e+07 -3.332090e+07 7.426350e+07 2.607878e+02 2.062575e+02 0.0 2.607878e+02 2.607878e+02 0.0 2.607878e+02 0.0 2.062575e+02 2.062575e+02 0.0 2.062575e+02 0.000000 1.000000 0.0 1.000000 1.000000 0.0 1.000000 0.0 -18.343198 -0.202645 0.013383 0.013383
25% 2.040000 2.092500 2.000000 2.050000 2.050000 3.066025e+06 -2.051286 0.395755 0.002136 0.097930 2.169980 1.935919 2.056071 -0.073269 4.618563 0.090000 16.787928 -0.035355 -0.031561 -0.029847 -0.207500 -2.464132 -0.065240 44.388527 -0.122706 43.722427 30.930758 33.134768 2.394282e+07 -1.935261e+06 1.236811e+08 8.075976e+04 6.387302e+04 0.0 8.075976e+04 8.075976e+04 0.0 8.075976e+04 0.0 6.387302e+04 6.387302e+04 0.0 6.387302e+04 0.000000 17.000000 0.0 17.000000 17.000000 0.0 17.000000 0.0 -2.051286 -0.020726 0.029084 0.029042
50% 2.720000 2.775000 2.610000 2.690000 2.690000 7.267350e+06 0.000000 0.653611 0.012094 0.160173 2.852455 2.508429 2.685714 0.014103 5.940081 0.180000 30.357014 0.012151 0.015130 0.002601 0.010000 0.521904 0.004990 51.013764 0.129062 50.152793 50.599612 51.126511 8.098261e+07 1.030555e+05 1.813854e+08 3.271393e+05 2.587350e+05 0.0 3.271393e+05 3.271393e+05 0.0 3.271393e+05 0.0 2.587350e+05 2.587350e+05 0.0 2.587350e+05 0.000000 36.000000 0.0 36.000000 36.000000 0.0 36.000000 0.0 0.000000 0.000000 0.043437 0.043324
75% 16.507501 17.340000 16.207500 16.865001 16.865001 1.994042e+07 2.294676 0.925920 0.591278 1.183878 18.494226 14.019353 16.323214 0.227035 7.757176 1.139999 51.960124 0.181843 0.175323 0.032990 0.390000 5.673471 0.105854 61.057683 0.584100 56.467981 69.962282 68.593983 5.393663e+08 4.542557e+06 9.017251e+08 1.233321e+06 9.754354e+05 0.0 1.233321e+06 1.233321e+06 0.0 1.233321e+06 0.0 9.754354e+05 9.754354e+05 0.0 9.754354e+05 1.000000 174.750000 0.0 175.000000 175.000000 0.0 175.000000 0.0 2.294676 0.022687 0.057217 0.057401
max 70.300003 75.489998 67.260002 73.180000 73.180000 1.530927e+08 35.114288 3.738913 170.627173 7.629902 78.938433 63.463137 66.152857 15.897052 19.343576 12.310001 97.559731 10.039432 9.156542 3.448520 37.040001 33.307508 1.141097 89.876747 2.465484 80.047317 96.698876 93.973365 1.394406e+09 1.156000e+08 1.844566e+09 2.556945e+07 2.022292e+07 0.0 2.556945e+07 2.556945e+07 0.0 2.556945e+07 0.0 2.022292e+07 2.022292e+07 0.0 2.022292e+07 35.000000 2314.000000 0.0 2338.000000 2338.000000 0.0 2338.000000 0.0 35.114288 0.300951 0.093125 0.093125
In [1827]:
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [1828]:
df = df.fillna(df.median())
In [1829]:
df.isna().sum()
Out[1829]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1830]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1162 entries, 67 to 1228
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1162 non-null   datetime64[ns]
 1   Open                       1162 non-null   float64       
 2   High                       1162 non-null   float64       
 3   Low                        1162 non-null   float64       
 4   Close                      1162 non-null   float64       
 5   Adj Close                  1162 non-null   float64       
 6   Volume                     1162 non-null   int64         
 7   Return                     1162 non-null   float64       
 8   Beta                       1162 non-null   float64       
 9   Variance                   1162 non-null   float64       
 10  AvgTrueRange               1162 non-null   float64       
 11  Upperband                  1162 non-null   float64       
 12  Lowerband                  1162 non-null   float64       
 13  Middleband                 1162 non-null   float64       
 14  APO                        1162 non-null   float64       
 15  NATR                       1162 non-null   float64       
 16  TRANGE                     1162 non-null   float64       
 17  DMI                        1162 non-null   float64       
 18  MACD                       1162 non-null   float64       
 19  MACDSIGNAL                 1162 non-null   float64       
 20  MACDHIST                   1162 non-null   float64       
 21  MOM                        1162 non-null   float64       
 22  PPO                        1162 non-null   float64       
 23  ROCP                       1162 non-null   float64       
 24  RSI                        1162 non-null   float64       
 25  TRIX                       1162 non-null   float64       
 26  ULTOSC                     1162 non-null   float64       
 27  SLOWK                      1162 non-null   float64       
 28  SLOWD                      1162 non-null   float64       
 29  AD                         1162 non-null   float64       
 30  ADOSC                      1162 non-null   float64       
 31  OBV                        1162 non-null   float64       
 32  Upward_momentum_created    1162 non-null   float64       
 33  Downward_momentum_created  1162 non-null   float64       
 34  B5_O_Um                    1162 non-null   float64       
 35  B5_C_Um                    1162 non-null   float64       
 36  B5_E_Um                    1162 non-null   float64       
 37  B5_A_Um                    1162 non-null   float64       
 38  B5_N_Um                    1162 non-null   float64       
 39  B5_O_Dm                    1162 non-null   float64       
 40  B5_C_Dm                    1162 non-null   float64       
 41  B5_E_Dm                    1162 non-null   float64       
 42  B5_A_Dm                    1162 non-null   float64       
 43  B5_N_Dm                    1162 non-null   float64       
 44  Verified_status_True       1162 non-null   int64         
 45  Verified_status_False      1162 non-null   int64         
 46  O                          1162 non-null   int64         
 47  C                          1162 non-null   int64         
 48  E                          1162 non-null   int64         
 49  A                          1162 non-null   int64         
 50  N                          1162 non-null   int64         
 51  Fake_news                  1162 non-null   int64         
 52  returns                    1162 non-null   float64       
 53  log_returns                1162 non-null   float64       
 54  vol_current                1162 non-null   float64       
 55  vol_future                 1162 non-null   float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 517.5 KB
In [1831]:
df.shape
Out[1831]:
(1162, 56)
In [1832]:
df=df.dropna()
In [1833]:
df.dtypes
Out[1833]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1834]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
Out[1834]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f07731c8c50>
In [1835]:
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1836]:
df_corr = df.corr()['AvgTrueRange'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 29 strongly correlated values with AvgTrueRange:
AvgTrueRange                 1.000000
Upperband                    0.965242
Middleband                   0.952797
High                         0.952675
Close                        0.949584
Adj Close                    0.949584
Open                         0.949400
Low                          0.944727
Lowerband                    0.926612
TRANGE                       0.917058
OBV                          0.854842
AD                           0.814414
Verified_status_False        0.713144
N                            0.712088
E                            0.712088
C                            0.712088
Volume                       0.681994
vol_current                  0.586920
NATR                         0.579738
vol_future                   0.573180
B5_C_Um                      0.558440
B5_N_Um                      0.558440
B5_E_Um                      0.558440
Upward_momentum_created      0.558440
B5_N_Dm                      0.558440
Downward_momentum_created    0.558440
B5_C_Dm                      0.558440
B5_E_Dm                      0.558440
Variance                     0.518497
Name: AvgTrueRange, dtype: float64
In [1837]:
df_corr = df.corr()['NATR'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 6 strongly correlated values with NATR :
NATR            1.000000
vol_future      0.840723
vol_current     0.803646
AvgTrueRange    0.579738
Volume          0.560546
TRANGE          0.532417
Name: NATR, dtype: float64
In [1838]:
df_corr = df.corr()['TRANGE'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 30 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.917058
Upperband                    0.883040
High                         0.875248
Close                        0.869932
Adj Close                    0.869932
Open                         0.866312
Middleband                   0.858810
Low                          0.857216
Verified_status_False        0.828699
N                            0.828177
E                            0.828177
C                            0.828177
Lowerband                    0.818685
OBV                          0.775837
Volume                       0.775386
AD                           0.731037
Upward_momentum_created      0.705363
B5_C_Um                      0.705363
B5_E_Um                      0.705363
B5_N_Um                      0.705363
Downward_momentum_created    0.705363
B5_C_Dm                      0.705363
B5_E_Dm                      0.705363
B5_N_Dm                      0.705363
Verified_status_True         0.620656
Variance                     0.594791
NATR                         0.532417
vol_current                  0.531723
vol_future                   0.529320
Name: TRANGE, dtype: float64
In [1839]:
df_corr = df.corr()['O'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Openness:
Series([], Name: O, dtype: float64)
In [1840]:
df_corr = df.corr()['C'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with conscientiousness:
N                            1.000000
C                            1.000000
E                            1.000000
Verified_status_False        0.999975
B5_N_Um                      0.927496
Upward_momentum_created      0.927496
B5_C_Um                      0.927496
B5_E_Um                      0.927496
B5_C_Dm                      0.927496
B5_E_Dm                      0.927496
Downward_momentum_created    0.927496
B5_N_Dm                      0.927496
Volume                       0.869134
TRANGE                       0.828177
Verified_status_True         0.804179
Close                        0.736924
Adj Close                    0.736924
High                         0.734684
Low                          0.725658
Open                         0.724803
Upperband                    0.723493
AvgTrueRange                 0.712088
Middleband                   0.693379
OBV                          0.687214
Lowerband                    0.647586
AD                           0.644185
Variance                     0.588205
ADOSC                        0.565885
Name: C, dtype: float64
In [1841]:
df_corr = df.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with conscientiousness:
N                            1.000000
C                            1.000000
E                            1.000000
Verified_status_False        0.999975
B5_N_Um                      0.927496
Upward_momentum_created      0.927496
B5_C_Um                      0.927496
B5_E_Um                      0.927496
B5_C_Dm                      0.927496
B5_E_Dm                      0.927496
Downward_momentum_created    0.927496
B5_N_Dm                      0.927496
Volume                       0.869134
TRANGE                       0.828177
Verified_status_True         0.804179
Close                        0.736924
Adj Close                    0.736924
High                         0.734684
Low                          0.725658
Open                         0.724803
Upperband                    0.723493
AvgTrueRange                 0.712088
Middleband                   0.693379
OBV                          0.687214
Lowerband                    0.647586
AD                           0.644185
Variance                     0.588205
ADOSC                        0.565885
Name: E, dtype: float64
In [1842]:
df_corr = df.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1843]:
df_corr = df.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with conscientiousness:
N                            1.000000
C                            1.000000
E                            1.000000
Verified_status_False        0.999975
B5_N_Um                      0.927496
Upward_momentum_created      0.927496
B5_C_Um                      0.927496
B5_E_Um                      0.927496
B5_C_Dm                      0.927496
B5_E_Dm                      0.927496
Downward_momentum_created    0.927496
B5_N_Dm                      0.927496
Volume                       0.869134
TRANGE                       0.828177
Verified_status_True         0.804179
Close                        0.736924
Adj Close                    0.736924
High                         0.734684
Low                          0.725658
Open                         0.724803
Upperband                    0.723493
AvgTrueRange                 0.712088
Middleband                   0.693379
OBV                          0.687214
Lowerband                    0.647586
AD                           0.644185
Variance                     0.588205
ADOSC                        0.565885
Name: N, dtype: float64
In [1844]:
df.columns
Out[1844]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1845]:
df_corr = df.corr()['B5_O_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1846]:
df_corr = df.corr()['B5_C_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with B5_C_Um:
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: B5_C_Um, dtype: float64
In [1847]:
df_corr = df.corr()['B5_E_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with B5_E_Um:
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: B5_E_Um, dtype: float64
In [1848]:
df_corr = df.corr()['B5_A_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1849]:
df_corr = df.corr()['B5_N_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with B5_N_Um:
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: B5_N_Um, dtype: float64

Downward momentum correlation

In [1850]:
df_corr = df.corr()['B5_O_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Dm:
Series([], Name: B5_O_Dm, dtype: float64)
In [1851]:
df_corr = df.corr()['B5_C_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with B5_C_Dm:
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: B5_C_Dm, dtype: float64
In [1852]:
df_corr = df.corr()['B5_E_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with B5_E_Dm:
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: B5_E_Dm, dtype: float64
In [1853]:
df_corr = df.corr()['B5_A_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1854]:
df_corr = df.corr()['B5_N_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with B5_N_Dm:
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: B5_N_Dm, dtype: float64
In [1855]:
df_corr = df.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Real_or_Fake_tweet :
Series([], Name: Fake_news, dtype: float64)
In [1856]:
df_corr = df.corr()['Downward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with Downward_momentum_created :
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: Downward_momentum_created, dtype: float64
In [1857]:
df_corr = df.corr()['Upward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with Upward_momentum_created :
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Upward_momentum_created      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
E                            0.927496
N                            0.927496
C                            0.927496
Verified_status_False        0.926159
Verified_status_True         0.855633
Volume                       0.836859
TRANGE                       0.705363
ADOSC                        0.567488
Adj Close                    0.562701
Close                        0.562701
High                         0.559879
AvgTrueRange                 0.558440
Low                          0.549684
Open                         0.548767
Upperband                    0.547932
OBV                          0.522328
Middleband                   0.516257
Name: Upward_momentum_created, dtype: float64
In [1858]:
df_corr = df.corr()['Verified_status_True'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 16 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.855633
B5_E_Dm                      0.855633
B5_C_Dm                      0.855633
Downward_momentum_created    0.855633
B5_N_Um                      0.855633
B5_E_Um                      0.855633
B5_C_Um                      0.855633
Upward_momentum_created      0.855633
N                            0.804179
E                            0.804179
C                            0.804179
Verified_status_False        0.799931
Volume                       0.695063
TRANGE                       0.620656
Variance                     0.535044
Name: Verified_status_True, dtype: float64
In [1859]:
df_corr = df.corr()['Verified_status_False'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
N                            0.999975
C                            0.999975
E                            0.999975
B5_N_Um                      0.926159
Upward_momentum_created      0.926159
B5_C_Um                      0.926159
B5_E_Um                      0.926159
B5_C_Dm                      0.926159
B5_E_Dm                      0.926159
Downward_momentum_created    0.926159
B5_N_Dm                      0.926159
Volume                       0.869158
TRANGE                       0.828699
Verified_status_True         0.799931
Close                        0.738492
Adj Close                    0.738492
High                         0.736243
Low                          0.727256
Open                         0.726370
Upperband                    0.724953
AvgTrueRange                 0.713144
Middleband                   0.694977
OBV                          0.688913
Lowerband                    0.649341
AD                           0.645725
Variance                     0.587448
ADOSC                        0.566053
Name: Verified_status_False, dtype: float64
In [1860]:
sns.set(font_scale=0.8)
In [1861]:
for i in range(0, len(df.columns), 5):
    sns.pairplot(data=df,
                x_vars=df.columns[i:i+5],
                y_vars=['NATR'])
In [1862]:
df.dtypes
Out[1862]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1863]:
df.isnull().sum()
Out[1863]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1864]:
df.fillna(0, inplace = True)
In [1865]:
df.dropna(inplace=True)
In [1866]:
sns.set(font_scale=0.8)
In [1867]:
corr = df.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1868]:
df.describe()
Out[1868]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1.162000e+03 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1162.000000 1.162000e+03 1.162000e+03 1.162000e+03 1.162000e+03 1.162000e+03 1162.0 1.162000e+03 1.162000e+03 1162.0 1.162000e+03 1162.0 1.162000e+03 1.162000e+03 1162.0 1.162000e+03 1162.000000 1162.000000 1162.0 1162.000000 1162.000000 1162.0 1162.000000 1162.0 1162.000000 1162.000000 1162.000000 1162.000000
mean 11.051411 11.433348 10.628003 11.041816 11.041816 1.403482e+07 0.336977 0.674348 1.626164 0.869643 12.146329 9.796496 10.971413 0.209092 6.434257 0.883709 35.217283 0.208783 0.214295 -0.005511 0.257341 1.689076 0.036138 52.537530 0.278960 50.378608 50.678326 50.697808 3.611977e+08 3.758567e+06 5.303308e+08 1.052013e+06 8.320383e+05 0.0 1.052013e+06 1.052013e+06 0.0 1.052013e+06 0.0 8.320383e+05 8.320383e+05 0.0 8.320383e+05 0.713425 122.587780 0.0 123.301205 123.301205 0.0 123.301205 0.0 0.337339 0.002212 0.045024 0.045003
std 14.739007 15.294377 14.103327 14.717598 14.717598 1.644401e+07 4.877626 0.519098 8.969261 1.384153 16.457646 12.961182 14.638802 2.155206 2.409189 1.541489 23.165046 1.600007 1.518631 0.442316 3.987780 7.823507 0.173229 12.394653 0.628753 9.617377 23.663775 22.027388 4.865686e+08 1.375174e+07 5.736336e+08 1.991892e+06 1.575390e+06 0.0 1.991892e+06 1.991892e+06 0.0 1.991892e+06 0.0 1.575390e+06 1.575390e+06 0.0 1.575390e+06 2.397524 200.367379 0.0 202.290350 202.290350 0.0 202.290350 0.0 4.877678 0.047879 0.018155 0.018154
min 1.020000 1.050000 0.990000 1.010000 1.010000 8.237000e+05 -18.343198 -2.910594 0.000135 0.046701 1.368452 0.884241 1.145714 -9.356923 2.818802 0.020000 0.012125 -4.832393 -4.348631 -3.035643 -20.990002 -18.032081 -0.405462 9.444984 -1.012364 13.269339 2.748066 4.236643 -3.719281e+07 -3.332090e+07 7.426350e+07 2.607878e+02 2.062575e+02 0.0 2.607878e+02 2.607878e+02 0.0 2.607878e+02 0.0 2.062575e+02 2.062575e+02 0.0 2.062575e+02 0.000000 1.000000 0.0 1.000000 1.000000 0.0 1.000000 0.0 -18.343198 -0.202645 0.013383 0.013383
25% 2.040000 2.092500 2.000000 2.050000 2.050000 3.066025e+06 -2.051286 0.395755 0.002136 0.097930 2.169980 1.935919 2.056071 -0.073269 4.618563 0.090000 16.787928 -0.035355 -0.031561 -0.029847 -0.207500 -2.464132 -0.065240 44.388527 -0.122706 43.722427 30.930758 33.134768 2.394282e+07 -1.935261e+06 1.236811e+08 8.075976e+04 6.387302e+04 0.0 8.075976e+04 8.075976e+04 0.0 8.075976e+04 0.0 6.387302e+04 6.387302e+04 0.0 6.387302e+04 0.000000 17.000000 0.0 17.000000 17.000000 0.0 17.000000 0.0 -2.051286 -0.020726 0.029200 0.029200
50% 2.720000 2.775000 2.610000 2.690000 2.690000 7.267350e+06 0.000000 0.653611 0.012094 0.160173 2.852455 2.508429 2.685714 0.014103 5.940081 0.180000 30.357014 0.012151 0.015130 0.002601 0.010000 0.521904 0.004990 51.013764 0.129062 50.152793 50.599612 51.126511 8.098261e+07 1.030555e+05 1.813854e+08 3.271393e+05 2.587350e+05 0.0 3.271393e+05 3.271393e+05 0.0 3.271393e+05 0.0 2.587350e+05 2.587350e+05 0.0 2.587350e+05 0.000000 36.000000 0.0 36.000000 36.000000 0.0 36.000000 0.0 0.000000 0.000000 0.043437 0.043324
75% 16.507501 17.340000 16.207500 16.865001 16.865001 1.994042e+07 2.294676 0.925920 0.591278 1.183878 18.494226 14.019353 16.323214 0.227035 7.757176 1.139999 51.960124 0.181843 0.175323 0.032990 0.390000 5.673471 0.105854 61.057683 0.584100 56.467981 69.962282 68.593983 5.393663e+08 4.542557e+06 9.017251e+08 1.233321e+06 9.754354e+05 0.0 1.233321e+06 1.233321e+06 0.0 1.233321e+06 0.0 9.754354e+05 9.754354e+05 0.0 9.754354e+05 1.000000 174.750000 0.0 175.000000 175.000000 0.0 175.000000 0.0 2.294676 0.022687 0.056604 0.056604
max 70.300003 75.489998 67.260002 73.180000 73.180000 1.530927e+08 35.114288 3.738913 170.627173 7.629902 78.938433 63.463137 66.152857 15.897052 19.343576 12.310001 97.559731 10.039432 9.156542 3.448520 37.040001 33.307508 1.141097 89.876747 2.465484 80.047317 96.698876 93.973365 1.394406e+09 1.156000e+08 1.844566e+09 2.556945e+07 2.022292e+07 0.0 2.556945e+07 2.556945e+07 0.0 2.556945e+07 0.0 2.022292e+07 2.022292e+07 0.0 2.022292e+07 35.000000 2314.000000 0.0 2338.000000 2338.000000 0.0 2338.000000 0.0 35.114288 0.300951 0.093125 0.093125
In [1869]:
# DROPPING ALL NaN VALUES
df.dropna(inplace=True)
In [1870]:
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [1871]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')
    
    plt.show();

Experiment 2: weekly granularity

In [1878]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected PLUG
In [1879]:
if(w.value == 'AAPL'):
  df = pd.read_csv('/content/Final_AAPL.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'ABUS'):
  df = pd.read_csv('/content/Final_ABUS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'ARDS'):
  df = pd.read_csv('/content/Final_ARDS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'BABA'):
  df = pd.read_csv('/content/Final_BABA.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'BFRI'):
  df = pd.read_csv('/content/Final_BFRI.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'FB'):
  df = pd.read_csv('/content/Final_FB.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'GME'):
  df = pd.read_csv('/content/Final_GME.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'MCD'):
  df = pd.read_csv('/content/Final_MCD.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'PFE'):
  df = pd.read_csv('/content/Final_PFE.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'PLUG'):
  df = pd.read_csv('/content/Final_PLUG.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'QCOM'):
  df = pd.read_csv('/content/Final_QCOM.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'SENS'):
  df = pd.read_csv('/content/Final_SENS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'TSLA'):
  df = pd.read_csv('/content/Final_TSLA.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'TWTR'):
  df = pd.read_csv('/content/Final_TWTR.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'UUUU'):
  df = pd.read_csv('/content/Final_UUUU.csv', parse_dates=['Date'], index_col=['Date'])
In [1880]:
df.columns
Out[1880]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1881]:
df.shape
Out[1881]:
(1229, 52)
In [1882]:
df.isnull().sum()
Out[1882]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           4
NATR                          0
TRANGE                        0
DMI                           0
MACD                         12
MACDSIGNAL                   12
MACDHIST                     12
MOM                           0
PPO                           4
ROCP                          0
RSI                           0
TRIX                         67
ULTOSC                        7
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1883]:
df = df.fillna(df.median())
del df['Unnamed: 0']
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [1884]:
df_weekly = df.resample('W').agg('mean')
In [1885]:
df_weekly.shape
Out[1885]:
(256, 51)
In [1886]:
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[1886]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077e981f90>
In [1887]:
sns.set(font_scale=0.8)
In [1888]:
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1889]:
df_corr = df_weekly.corr()['AvgTrueRange'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with AvgTrueRange:
AvgTrueRange                 1.000000
TRANGE                       0.967656
Upperband                    0.967619
High                         0.956737
Middleband                   0.956576
Open                         0.954374
Close                        0.954151
Adj Close                    0.954151
Low                          0.950271
Lowerband                    0.934673
OBV                          0.859777
AD                           0.821340
Verified_status_False        0.813485
E                            0.813222
C                            0.813222
N                            0.813222
Volume                       0.729908
Downward_momentum_created    0.681136
B5_C_Dm                      0.681136
B5_E_Dm                      0.681136
B5_N_Dm                      0.681136
Upward_momentum_created      0.681136
B5_C_Um                      0.681136
B5_E_Um                      0.681136
B5_N_Um                      0.681136
Verified_status_True         0.671858
NATR                         0.552609
Variance                     0.551840
Name: AvgTrueRange, dtype: float64
In [1890]:
df_corr = df_weekly.corr()['NATR'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 4 strongly correlated values with NATR :
NATR            1.000000
Volume          0.606174
AvgTrueRange    0.552609
TRANGE          0.526993
Name: NATR, dtype: float64
In [1891]:
df_corr = df_weekly.corr()['TRANGE'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.967656
Upperband                    0.941849
High                         0.931642
Close                        0.928171
Adj Close                    0.928171
Open                         0.927116
Low                          0.921925
Middleband                   0.921461
Lowerband                    0.887989
Verified_status_False        0.878905
E                            0.878887
N                            0.878887
C                            0.878887
OBV                          0.830726
AD                           0.785102
Volume                       0.784007
Downward_momentum_created    0.769628
B5_C_Dm                      0.769628
B5_E_Dm                      0.769628
B5_N_Dm                      0.769628
Upward_momentum_created      0.769628
B5_C_Um                      0.769628
B5_E_Um                      0.769628
B5_N_Um                      0.769628
Verified_status_True         0.754019
Variance                     0.645315
NATR                         0.526993
Name: TRANGE, dtype: float64
In [1892]:
df_corr = df_weekly.corr()['O'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Openness:
Series([], Name: O, dtype: float64)
In [1893]:
df_corr = df_weekly.corr()['C'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with conscientiousness:
N                            1.000000
C                            1.000000
E                            1.000000
Verified_status_False        0.999988
B5_N_Um                      0.941757
Upward_momentum_created      0.941757
Downward_momentum_created    0.941757
B5_C_Um                      0.941757
B5_E_Um                      0.941757
B5_C_Dm                      0.941757
B5_E_Dm                      0.941757
B5_N_Dm                      0.941757
Volume                       0.903508
TRANGE                       0.878887
Verified_status_True         0.861294
Close                        0.838054
Adj Close                    0.838054
High                         0.837742
Upperband                    0.833190
Open                         0.832505
Low                          0.832121
AvgTrueRange                 0.813222
Middleband                   0.808039
OBV                          0.787604
Lowerband                    0.769455
AD                           0.744052
Variance                     0.636448
ADOSC                        0.619996
Name: C, dtype: float64
In [1894]:
df_corr = df_weekly.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with conscientiousness:
N                            1.000000
C                            1.000000
E                            1.000000
Verified_status_False        0.999988
B5_N_Um                      0.941757
Upward_momentum_created      0.941757
Downward_momentum_created    0.941757
B5_C_Um                      0.941757
B5_E_Um                      0.941757
B5_C_Dm                      0.941757
B5_E_Dm                      0.941757
B5_N_Dm                      0.941757
Volume                       0.903508
TRANGE                       0.878887
Verified_status_True         0.861294
Close                        0.838054
Adj Close                    0.838054
High                         0.837742
Upperband                    0.833190
Open                         0.832505
Low                          0.832121
AvgTrueRange                 0.813222
Middleband                   0.808039
OBV                          0.787604
Lowerband                    0.769455
AD                           0.744052
Variance                     0.636448
ADOSC                        0.619996
Name: E, dtype: float64
In [1895]:
df_corr = df_weekly.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1896]:
df_corr = df_weekly.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with conscientiousness:
N                            1.000000
C                            1.000000
E                            1.000000
Verified_status_False        0.999988
B5_N_Um                      0.941757
Upward_momentum_created      0.941757
Downward_momentum_created    0.941757
B5_C_Um                      0.941757
B5_E_Um                      0.941757
B5_C_Dm                      0.941757
B5_E_Dm                      0.941757
B5_N_Dm                      0.941757
Volume                       0.903508
TRANGE                       0.878887
Verified_status_True         0.861294
Close                        0.838054
Adj Close                    0.838054
High                         0.837742
Upperband                    0.833190
Open                         0.832505
Low                          0.832121
AvgTrueRange                 0.813222
Middleband                   0.808039
OBV                          0.787604
Lowerband                    0.769455
AD                           0.744052
Variance                     0.636448
ADOSC                        0.619996
Name: N, dtype: float64
In [1897]:
df_corr = df_weekly.corr()['B5_O_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1898]:
df_corr = df_weekly.corr()['B5_C_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with B5_C_Um:
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: B5_C_Um, dtype: float64
In [1899]:
df_corr = df_weekly.corr()['B5_E_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with B5_E_Um:
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: B5_E_Um, dtype: float64
In [1900]:
df_corr = df_weekly.corr()['B5_A_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1901]:
df_corr = df_weekly.corr()['B5_N_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with B5_N_Um:
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: B5_N_Um, dtype: float64

Downward momentum correlation

In [1902]:
df_corr = df_weekly.corr()['B5_O_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Dm:
Series([], Name: B5_O_Dm, dtype: float64)
In [1903]:
df_corr = df_weekly.corr()['B5_C_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with B5_C_Dm:
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: B5_C_Dm, dtype: float64
In [1904]:
df_corr = df_weekly.corr()['B5_E_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with B5_E_Dm:
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: B5_E_Dm, dtype: float64
In [1905]:
df_corr = df_weekly.corr()['B5_A_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1906]:
df_corr = df_weekly.corr()['B5_N_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with B5_N_Dm:
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: B5_N_Dm, dtype: float64
In [1907]:
df_corr = df_weekly.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Real_or_Fake_tweet :
Series([], Name: Fake_news, dtype: float64)
In [1908]:
df_corr = df_weekly.corr()['Downward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with Downward_momentum_created :
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: Downward_momentum_created, dtype: float64
In [1909]:
df_corr = df_weekly.corr()['Upward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with Upward_momentum_created :
B5_C_Um                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Um                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
E                            0.941757
N                            0.941757
C                            0.941757
Verified_status_False        0.941159
Volume                       0.903201
Verified_status_True         0.872454
TRANGE                       0.769628
Close                        0.685335
Adj Close                    0.685335
High                         0.684924
AvgTrueRange                 0.681136
Upperband                    0.678678
Open                         0.678067
Low                          0.677136
ADOSC                        0.672206
Middleband                   0.648801
OBV                          0.642056
Lowerband                    0.605532
AD                           0.586463
Variance                     0.573433
Name: Upward_momentum_created, dtype: float64
In [1910]:
df_corr = df_weekly.corr()['Verified_status_True'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_C_Um                      0.872454
Upward_momentum_created      0.872454
B5_N_Um                      0.872454
B5_E_Um                      0.872454
Downward_momentum_created    0.872454
B5_N_Dm                      0.872454
B5_E_Dm                      0.872454
B5_C_Dm                      0.872454
E                            0.861294
N                            0.861294
C                            0.861294
Verified_status_False        0.858813
Volume                       0.805721
TRANGE                       0.754019
Variance                     0.692518
AvgTrueRange                 0.671858
Upperband                    0.648235
High                         0.642501
Close                        0.642476
Adj Close                    0.642476
Open                         0.635525
Low                          0.633293
Middleband                   0.609667
OBV                          0.576786
ADOSC                        0.575422
Lowerband                    0.555687
AD                           0.550870
Name: Verified_status_True, dtype: float64
In [1911]:
df_corr = df_weekly.corr()['Verified_status_False'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
N                            0.999988
C                            0.999988
E                            0.999988
B5_N_Um                      0.941159
Upward_momentum_created      0.941159
B5_C_Um                      0.941159
B5_E_Um                      0.941159
B5_C_Dm                      0.941159
B5_E_Dm                      0.941159
Downward_momentum_created    0.941159
B5_N_Dm                      0.941159
Volume                       0.903234
TRANGE                       0.878905
Verified_status_True         0.858813
Close                        0.838803
Adj Close                    0.838803
High                         0.838488
Upperband                    0.833845
Open                         0.833275
Low                          0.832910
AvgTrueRange                 0.813485
Middleband                   0.808856
OBV                          0.788567
Lowerband                    0.770470
AD                           0.744904
Variance                     0.635059
ADOSC                        0.619592
Name: Verified_status_False, dtype: float64
In [1912]:
sns.set(font_scale=0.8)
In [1913]:
for i in range(0, len(df_weekly.columns), 5):
    sns.pairplot(data=df_weekly,
                x_vars=df_weekly.columns[i:i+5],
                y_vars=['NATR'])
In [1914]:
df_weekly.fillna(0, inplace = True)
In [1915]:
df_weekly.dropna(inplace=True)
In [1916]:
corr = df_weekly.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [1917]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Weekly Volatility Distribution')
    
    plt.show();